■ matplotlib
matplotlibのめっちゃまとめ - Qiita
https://qiita.com/nkay/items/d1eb91e33b9d6469ef51
matplotlib + seaborn — Pythonでグラフ描画 - Heavy Watal
https://heavywatal.github.io/python/matplotlib.html
matplotlib > gallery
https://matplotlib.org/stable/gallery/index.html
■ seaborn
Example gallery:
https://seaborn.pydata.org/examples/
seabornの細かい見た目調整をあきらめない
https://qiita.com/skotaro/items/7fee4dd35c6d42e0ebae
seabornでMatplotlibの見た目を良くする
https://note.nkmk.me/python-matplotlib-seaborn-basic/
#====================================================
# ■ plot [matplotlib]
# matplotlib入門
# http://bicycle1885.hatenablog.com/entry/2014/02/14/023734
# Matplotlib: 作図
# http://www.turbare.net/transl/scipy-lecture-notes/intro/matplotlib/matplotlib.html
#
# 早く知っておきたかったmatplotlibの基礎知識、あるいは見た目の調整が捗るArtistの話
# https://qiita.com/skotaro/items/08dc0b8c5704c94eafb9
# - matplotlibにはグラフを作る際の二つの流儀がある
#
#====================================================
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# Sample data: a sine curve sampled every 0.1 on [-3, 3).
px = np.arange(-3, 3, 0.1)
py = np.sin(px)

# a) Object-oriented interface, variant 1: create Figure and Axes in one call.
fig, ax = plt.subplots(figsize=(4, 3))
ax.plot(px, py)  # returns matplotlib.lines.Line2D objects
ax.set_title('Plot')

# a) Object-oriented interface, variant 2: create the Figure, then add an Axes.
fig = plt.figure(figsize=(4, 3))
ax = fig.add_subplot(1, 1, 1)  # arguments are (rows, cols, index)
ax.plot(px, py)
ax.set_title('Plot')

# b) The pyplot API: do everything through plt.* calls on the current figure.
plt.figure(figsize=(4, 3))
plt.plot(px, py)
plt.title('Plot')
plt.show()
### 重ね描き
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# Sample data: sine and cosine on the same grid.
px = np.arange(-3, 3, 0.1)
pys = np.sin(px)
pyc = np.cos(px)

# Line settings shared by every curve below; only the marker differs.
curve_style = dict(linestyle='-', linewidth=1, markersize=6)

# pyplot API: successive plot() calls draw onto the same current Axes.
plt.figure(figsize=(4, 3))  # figure size in inches
plt.plot(px, pys, marker='+', **curve_style)
plt.plot(px, pyc, marker='x', **curve_style)
plt.show()

# Object-oriented API: the same overlay on an explicit Axes.
fig = plt.figure(figsize=(4, 3))
ax = fig.add_subplot(1, 1, 1)  # arguments are (rows, cols, index)
ax.plot(px, pys, marker='+', **curve_style)
ax.plot(px, pyc, marker='x', **curve_style)
plt.show()
# 散布図
# matplotlib.pyplot.scatter(x, y, s=20, c=None, marker='o', cmap=None, norm=None,
# vmin=None, vmax=None, alpha=None, linewidths=None,
# verts=None, edgecolors=None, hold=None, data=None,
# **kwargs)
# x, y グラフに出力するデータ
# s サイズ (デフォルト値: 20)
# c 色、または、連続した色の値
# marker マーカーの形 (デフォルト値: ‘o’= 円)
# cmap カラーマップ。c が float 型の場合のみ利用可能
# norm c を float 型の配列を指定した場合のみ有効。
# 正規化を行う場合の Normalize インスタンスを指定。
# vmin, vmax 正規化時の最大、最小値。 指定しない場合、データの最大・最小値となります。
# norm にインスタンスを指定した場合、vmin, vmax の指定は無視されます。
# alpha 透明度。0(透明)~1(不透明)の間の数値を指定。
# linewidths 線の太さ
# edgecolors 線の色
#
# Color Map
# Example:https://matplotlib.org/examples/color/colormaps_reference.html
from matplotlib import cm
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# Sample data: sine and cosine on the same grid.
x = np.arange(-3, 3, 0.1)
sy = np.sin(x)
cy = np.cos(x)

fig = plt.figure(figsize=(4, 3))
ax = fig.add_subplot(111)
# One scatter() call per series; each returns a matplotlib.collections.PathCollection.
for values, colour in ((sy, 'blue'), (cy, 'red')):
    ax.scatter(x, values, s=5, c=colour)
ax.set(title='scatter plot', xlabel='x', ylabel='y')
ax.grid(True)
plt.show()
#-------------
from sklearn import datasets
N = 300  # total number of samples
X, y = datasets.make_moons(N, noise=0.3)
Y = y.reshape(N, 1)  # labels reshaped to (N, 1)
#=============================
# matplotlib
coldic = {0: 'red', 1: 'blue'}
# Look the colours up from the 1-D label array: calling int() on the
# (1,)-shaped rows of Y is deprecated since NumPy 1.25.
plt.scatter(X[:, 0], X[:, 1], c=[coldic[int(label)] for label in y])
plt.show()
#=============================
# seaborn
import seaborn as sns
# scatterplot returns the matplotlib Axes it drew on.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, palette='bright',
                marker='o', edgecolor=None, s=20, alpha=0.5)
<AxesSubplot:>
# ◆散布図行列 (Seaborn)
# pairplot
# pairplot(data, hue=None, hue_order=None, palette=None, vars=None, x_vars=None, y_vars=None,
# kind='scatter', diag_kind='hist', markers=None, height=2.5, aspect=1, dropna=True,
# plot_kws=None, diag_kws=None, grid_kws=None)
# Return: seaborn.axisgrid.PairGrid
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# seaborn's "iris" example dataset.
iris = sns.load_dataset("iris")
#print(iris.head(4))
# Scatter-plot matrix coloured by species; pairplot returns a seaborn.axisgrid.PairGrid.
sns.pairplot(data=iris, hue="species", height=2.5)
plt.show()
# sepal_length sepal_width petal_length petal_width species
#0 5.1 3.5 1.4 0.2 setosa
#1 4.9 3.0 1.4 0.2 setosa
#2 4.7 3.2 1.3 0.2 setosa
#3 4.6 3.1 1.5 0.2 setosa
<class 'seaborn.axisgrid.PairGrid'>
# ◆Seaborn で散布図・回帰モデルを可視化する
# lmplot, regplot
# http://pythondatascience.plavox.info/seaborn/%E6%95%A3%E5%B8%83%E5%9B%B3%E3%83%BB%E5%9B%9E%E5%B8%B0%E3%83%A2%E3%83%87%E3%83%AB
# seaborn.lmplot
# https://seaborn.pydata.org/generated/seaborn.lmplot.html
# lmplot(x, y, data, hue=None, col=None, row=None, palette=None,
# col_wrap=None, size=5, aspect=1, markers='o', sharex=True,
# sharey=True, hue_order=None, col_order=None, row_order=None,
# legend=True, legend_out=True, x_estimator=None, x_bins=None,
# x_ci='ci', scatter=True, fit_reg=True, ci=95, n_boot=1000,
# units=None, order=1, logistic=False, lowess=False, robust=False,
# logx=False, x_partial=None, y_partial=None, truncate=False,
# x_jitter=None, y_jitter=None, scatter_kws=None, line_kws=None)
# Return: seaborn.axisgrid.FacetGrid
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
iris = sns.load_dataset("iris")
# One marker per species; fit_reg=False turns the regression line off, so only
# the scatter is drawn. lmplot returns a seaborn.axisgrid.FacetGrid.
lm_options = dict(hue="species", markers=["+", "x", "."], fit_reg=False, legend=True)
sns.lmplot(data=iris, x="petal_length", y="sepal_length", **lm_options)
plt.show()
<seaborn.axisgrid.FacetGrid object at 0x000001B73905FA30>
# KDE: カーネル密度推定 (kernel density estimate)
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
iris = sns.load_dataset("iris")

# Per-species settings: (species, colormap, filled contours?, marker, label position).
species_styles = [
    ("setosa", "Reds", True, "o", (2.0, 4.5)),
    ("virginica", "Blues", True, "X", (3.5, 7.2)),
    ("versicolor", "Greens", False, "+", (2.5, 6.0)),
]

# Set up the figure
f, ax = plt.subplots(figsize=(8, 8))
ax.set_aspect("equal")

# Draw one bivariate density plus the raw points for each of the three species.
# Everything is drawn on `ax` explicitly so the name keeps referring to the Axes.
for species, cmap, filled, marker, _ in species_styles:
    subset = iris[iris["species"] == species]
    # `fill` is the current keyword; seaborn deprecated `shade` in its favour.
    sns.kdeplot(x=subset.petal_length, y=subset.sepal_length,
                cmap=cmap, fill=filled, thresh=0.05, ax=ax)
    ax.scatter(x=subset.petal_length, y=subset.sepal_length, marker=marker)

# Label each species with the second-darkest colour of its palette.
for species, cmap, _, _, (text_x, text_y) in species_styles:
    ax.text(text_x, text_y, species, size=16, color=sns.color_palette(cmap)[-2])
plt.show()
<class 'matplotlib.axes._subplots.AxesSubplot'>
# jointplot, JointGrid
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
iris = sns.load_dataset("iris")

# jointplot: the plot type is selected with `kind`
# kind: "scatter" | "kde" | "hist" | "hex" | "reg" | "resid"
# Each call returns a seaborn.axisgrid.JointGrid.
sns.jointplot(data=iris, x="petal_length", y="sepal_length", hue="species",
              alpha=0.5, kind="scatter")

# Regression variant with separate styling for the joint and marginal plots.
reg_joint_kws = {'scatter_kws': dict(s=8, alpha=0.5),
                 'line_kws': dict(lw=1, color='black')}
sns.jointplot(data=iris, x="petal_length", y="sepal_length", kind='reg',
              joint_kws=reg_joint_kws, marginal_kws=dict(bins=40))

# JointGrid.plot: choose the joint (inner) and marginal (outer) plot functions.
g = sns.JointGrid(data=iris, x="petal_length", y="sepal_length", hue="species")
g.plot(sns.kdeplot, sns.histplot)

# JointGrid.plot_joint / plot_marginals: choose the functions and their settings.
g = sns.JointGrid(data=iris, x="petal_length", y="sepal_length", hue="species")
g.plot_joint(sns.scatterplot, s=100, alpha=.5)
g.plot_marginals(sns.histplot, bins=20, multiple='dodge', kde=True)

# Synthetic data: rx is drawn from a normal distribution (mean 5, std 2) and
# rounded; ry adds standard-normal noise to each rx value.
rx = np.random.normal(5, 2, 100).round()
ry = [r + np.random.normal() for r in rx]
marginal_opts = dict(bins=25, kde=False)

g = sns.JointGrid(x=rx, y=ry)
g.plot_joint(sns.boxplot, color='cornflowerblue')
g.plot_marginals(sns.histplot, **marginal_opts)

g = sns.JointGrid(x=rx, y=ry)
jitter_scatter_kws = {'alpha': 0.3, 's': 20}
g.plot_joint(sns.regplot, x_jitter=0.05, y_jitter=0, fit_reg=False,
             scatter_kws=jitter_scatter_kws)
g.plot_marginals(sns.histplot, **marginal_opts)

g = sns.JointGrid(x=rx, y=ry)
g.plot_joint(sns.stripplot, alpha=0.5)
g.plot_marginals(sns.histplot, **marginal_opts)
<seaborn.axisgrid.JointGrid at 0x2b5ea247e20>
import numpy as np
import pandas as pd
import math
from scipy import stats
from scipy.stats import norm, skew #for some statistics
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
### Functions =================================================================
def xdescribe(df, dsp=True, ret=False):
    """Summarise every column of *df*; an extended ``DataFrame.describe``.

    Prints the frame's shape and builds a table with one row per column of
    *df* holding its dtype, NA count, number of distinct values (NaN counts
    as a value) and the statistics of ``describe(include='all')`` without
    that method's own ``unique`` column.

    Args:
        df: DataFrame to summarise.
        dsp: If true, render the table with every row visible and list the
            categories of each categorical column.
        ret: If true, return the summary table.

    Returns:
        The summary DataFrame when *ret* is true, otherwise None.
    """
    print(f'## Shape:{df.shape}')
    dtype = pd.DataFrame({'dtype': df.dtypes})
    n_na = pd.DataFrame({'NAs': df.isnull().sum()})
    # dropna=False keeps NaN as a distinct value, matching len(col.unique()).
    n_unique = pd.DataFrame({'n_unique': df.nunique(dropna=False)})
    df_descr = df.describe(include='all').T
    if 'unique' in df_descr.columns:
        df_descr = df_descr.drop(['unique'], axis=1)
    df_descr = pd.concat([dtype, n_na, n_unique, df_descr], axis=1)
    df_descr['count'] = df_descr['count'].astype(int)
    if dsp:
        # Imported here so the function does not depend on what the notebook
        # namespace currently binds the name `display` to.
        from IPython.display import display
        # option_context restores the row limit even if display() raises.
        with pd.option_context('display.max_rows', None):
            display(df_descr)
        # NOTE(review): the source's indentation was lost; the category
        # listing is assumed to belong to the dsp branch -- confirm.
        dtype_names = df_descr['dtype'].astype(str)
        categorical = [idx for idx in df_descr.index
                       if dtype_names[idx] == 'category']
        if categorical:
            print('### Featur: Categories')
        for idx in categorical:
            print(f' {idx}: \t{df[idx].cat.categories}')
    if ret:
        return df_descr
    return None
### Functions =================================================================
# 目的変数の分布を確認
def target_dist(df, target_colname):
    """Print summary statistics of a target column and plot its distribution.

    Missing values are dropped once, so the normal fit, the skewness and the
    Q-Q plot all use the same observations. The left panel shows a density
    histogram with a KDE curve and the fitted normal pdf; the right panel
    shows a probability (Q-Q) plot.

    Args:
        df: DataFrame containing the target column.
        target_colname: Name of the column to inspect.
    """
    values = df[target_colname].dropna()
    # Fitted parameters of a normal distribution (mean, standard deviation).
    mu, sigma = norm.fit(values)
    skewness = skew(values)
    # sigma is a standard deviation, so it is labelled as such.
    print(f'\n{target_colname}; 平均={mu:.2f}, 標準偏差={sigma:.2f}, 歪度={skewness:.2f}')
    # Distribution panel
    fig = plt.figure(figsize=(12, 4))
    ax = fig.add_subplot(1, 2, 1)
    ax.set_title(target_colname + ' distribution')
    sns.histplot(values, stat='density', bins=20,
                 kde=True, line_kws={'linewidth': 2, 'color': 'black'}, ax=ax)
    t_min, t_max = values.min(), values.max()
    sp = (t_max - t_min) / 100
    # linspace rather than arange: a constant column gives sp == 0, which
    # arange cannot use as a step.
    x = np.linspace(t_min - sp * 10, t_max + sp * 10, 121)
    y = norm.pdf(x, mu, sigma)
    sns.lineplot(x=x, y=y, color='black', ax=ax)
    # Q-Q plot panel
    ax = fig.add_subplot(1, 2, 2)
    stats.probplot(values, plot=ax)
    plt.show()
### Functions =================================================================
# 目的変数(target)が連続、説明変数(Feature)がカテゴリ変数変数の場合
# カテゴリ変数の分布を確認
def plt_box_dist(df, target, feature):
    """Plot a continuous target against a categorical feature.

    Left panel: horizontal box plot of *target* for each value of *feature*.
    Right panel: count of rows for each value of *feature*. Figure height
    grows and tick-label size shrinks with the number of distinct values.

    Args:
        df: DataFrame holding both columns.
        target: Name of the continuous target column.
        feature: Name of the categorical (or low-cardinality) feature column.
    """
    n_unique = len(df[feature].unique())
    # Keep the label size positive even for very many categories.
    font_size = max(1, 12 - n_unique / 5)
    fig_height = 1 + n_unique / 4
    is_numeric = pd.api.types.is_numeric_dtype(df[feature])
    plt.figure(figsize=(8, fig_height))
    # Box plot; outlier markers get more transparent as the row count grows.
    flier_alpha = min(1, 1 / np.log10(df.shape[0] / 10 + 9))
    ax0 = plt.subplot2grid((1, 6), (0, 0), colspan=4)
    sns.boxplot(y=df[feature], x=df[target], orient='h', color='#628CD2', width=0.7,
                flierprops={'markersize': 5, 'markerfacecolor': 'gray',
                            'markeredgecolor': 'None',
                            'alpha': flier_alpha},
                ax=ax0)
    ax0.tick_params(axis='y', labelsize=font_size)
    if is_numeric:
        ax0.invert_yaxis()  # numeric values run bottom-to-top
    # Count distribution, sharing the category order of the box plot.
    ax1 = plt.subplot2grid((1, 6), (0, 4), colspan=2)
    sns.countplot(y=df[feature], color='#628CD2', ax=ax1)
    ax1.set_yticklabels([])
    ax1.set_ylabel('')
    if is_numeric:
        ax1.invert_yaxis()
    plt.show()
### Functions =================================================================
# 目的変数(target)が連続、説明変数(Feature)が連続(数値)変数の場合
# 目的関数との相関/分布をプロット
def plt_reg_dist(df, target, feature):
    """Plot a continuous target against a continuous feature.

    Left panel: scatter with a fitted regression line, *target* on the x axis
    and *feature* on the y axis. Right panel: histogram with KDE of *feature*
    along the same y axis.

    Args:
        df: DataFrame holding both columns.
        target: Name of the continuous target column.
        feature: Name of the numeric feature column.
    """
    plt.figure(figsize=(8, 3))
    # Points get more transparent as the row count grows; never above 1.
    point_alpha = min(1, 1 / np.log10(df.shape[0] + 9))
    # Regression panel
    ax0 = plt.subplot2grid((1, 6), (0, 0), colspan=4)
    sns.regplot(data=df, x=target, y=feature, color='#3050A0',
                scatter_kws={'alpha': point_alpha, 's': 12},
                line_kws={'lw': 1, 'color': 'black'}, ax=ax0)
    # Distribution panel
    ax1 = plt.subplot2grid((1, 6), (0, 4), colspan=2)
    sns.histplot(data=df, y=feature, stat='count', bins=30, color='#628CD2',
                 shrink=1.0, kde=True, ax=ax1)
    ax1.set_yticklabels([])
    ax1.set_ylabel('')
    plt.show()
### Function =================================================================
# 予測誤差(true-pred)プロット
def true_pred(df, true_colname, pred_colname):
    """Draw a predicted-versus-true scatter plot with RMSE and R2.

    Both axes share one range padded by 10% of the data span. The dashed red
    line is y = x; the solid red line is a linear regression of the
    predictions on the true values. The figure is left open (no plt.show()),
    as in the original.

    Args:
        df: DataFrame holding both columns.
        true_colname: Column with the observed values.
        pred_colname: Column with the predicted values.
    """
    fig, ax = plt.subplots(figsize=(5, 5))
    xymin = min(df[true_colname].min(), df[pred_colname].min())
    xymax = max(df[true_colname].max(), df[pred_colname].max())
    pitch = (xymax - xymin) / 100
    lower, upper = xymin - pitch * 10, xymax + pitch * 10
    sns.scatterplot(data=df, x=true_colname, y=pred_colname, color='mediumblue',
                    alpha=min(1, 1 / np.log10(df.shape[0] + 9)), ax=ax)
    ax.set_xlabel(true_colname)
    ax.set_ylabel(pred_colname)
    ax.set_xlim([lower, upper])
    ax.set_ylim([lower, upper])
    # linspace rather than arange: identical values give pitch == 0, which
    # arange cannot use as a step.
    x = np.linspace(lower, upper, 121)
    sns.lineplot(x=x, y=x, color='red', linewidth=1, linestyle='--', ax=ax)
    x = np.linspace(xymin - pitch * 5, xymax + pitch * 5, 111)
    reg = LinearRegression().fit(df[[true_colname]], df[pred_colname])
    sns.lineplot(x=x, y=reg.coef_[0] * x + reg.intercept_,
                 color='red', linewidth=2, ax=ax)
    RMSE = np.sqrt(mean_squared_error(df[true_colname], df[pred_colname]))
    R2 = r2_score(df[true_colname], df[pred_colname])
    # Axes-relative coordinates keep the annotation in the upper-left corner.
    ax.text(x=0.1, y=0.85, transform=ax.transAxes, fontsize=12,
            s=f'RMSE = {RMSE:.3f}\nR2 = {R2:.3f}')
### ===========================================================================
### ===========================================================================
### Main
df = sns.load_dataset('tips')  # seaborn "tips" example dataset
df_xdesc = xdescribe(df, ret=True)  # show and keep the per-column summary
target_colname = 'tip'  # target variable
print('\n## 目的変数の分布を確認')
target_dist(df, target_colname)
print('\n## 説明変数の分布を確認')
feature_colnames = [c for c in df.columns if c != target_colname]
for i, colname in enumerate(feature_colnames):
    print(f'\n## {i+1}/{len(feature_colnames)}, {colname}')
    if str(df_xdesc.loc[colname, 'dtype']) in ['object', 'category']:
        plt_box_dist(df, target_colname, colname)
    else:
        plt_reg_dist(df, target_colname, colname)
        # NOTE(review): the source's indentation was lost; this check is
        # assumed to apply to numeric features only, so low-cardinality
        # numeric columns also get a box plot -- confirm.
        if df_xdesc.loc[colname, 'n_unique'] <= 30:
            plt_box_dist(df, target_colname, colname)
# regression
# Split into train/test sets, then into features and target.
df_train, df_test = train_test_split(df, train_size=0.75, random_state=123)
X_train = df_train.drop(target_colname, axis=1)
y_train = df_train[target_colname]
X_test = df_test.drop(target_colname, axis=1)
y_test = df_test[target_colname]
lgb_train = lgb.Dataset(X_train, y_train)  # training set
lgb_test = lgb.Dataset(X_test, y_test)  # evaluation set (not passed to lgb.train below)
# Parameters
# NOTE(review): verify that the installed LightGBM version recognises the 'r2' metric.
params = {'objective': 'regression', 'metric': ['rmse', 'r2'], 'verbosity': -1}
# Train with LightGBM
lgb_model = lgb.train(params, lgb_train)
# Predict on the evaluation set
y_pred = lgb_model.predict(X_test)
df_pred = pd.DataFrame({'true': y_test, 'predicted': y_pred})
print('## LightGBM regression結果')
# Prediction-error (true vs. predicted) plot
true_pred(df_pred, 'true', 'predicted')
## Shape:(244, 7)
| dtype | NAs | n_unique | count | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| total_bill | float64 | 0 | 229 | 244 | NaN | NaN | 19.785943 | 8.902412 | 3.07 | 13.3475 | 17.795 | 24.1275 | 50.81 |
| tip | float64 | 0 | 123 | 244 | NaN | NaN | 2.998279 | 1.383638 | 1.0 | 2.0 | 2.9 | 3.5625 | 10.0 |
| sex | category | 0 | 2 | 244 | Male | 157 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| smoker | category | 0 | 2 | 244 | No | 151 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| day | category | 0 | 4 | 244 | Sat | 87 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| time | category | 0 | 2 | 244 | Dinner | 176 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| size | int64 | 0 | 6 | 244 | NaN | NaN | 2.569672 | 0.9511 | 1.0 | 2.0 | 2.0 | 3.0 | 6.0 |
### Featur: Categories sex: Index(['Male', 'Female'], dtype='object') smoker: Index(['Yes', 'No'], dtype='object') day: Index(['Thur', 'Fri', 'Sat', 'Sun'], dtype='object') time: Index(['Lunch', 'Dinner'], dtype='object') ## 目的変数の分布を確認 tip; 平均=3.00, 分散=1.38, 歪度=1.46
## 説明変数の分布を確認 ## 1/6, total_bill
## 2/6, sex
## 3/6, smoker
## 4/6, day
## 5/6, time
## 6/6, size
## LightGBM regression結果
# 目的変数が連続、Featureが連続変数の場合
# Feature毎に、目的変数との相関をプロット
import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
def plt_scatter(target, feature):
    """Draw *target* against *feature* with a regression line on the current Axes.

    The axis limits are set to the data range padded by 3% on every side.

    Args:
        target: Values for the y axis.
        feature: Values for the x axis.

    Returns:
        Whatever ``sns.regplot`` returns (the Axes it drew on).
    """
    # Compute each extreme once with NumPy instead of calling the Python
    # builtins min()/max() twice per bound.
    x_lo, x_hi = np.min(feature), np.max(feature)
    y_lo, y_hi = np.min(target), np.max(target)
    xmargin = (x_hi - x_lo) * 0.03
    ymargin = (y_hi - y_lo) * 0.03
    plt.xlim([x_lo - xmargin, x_hi + xmargin])
    plt.ylim([y_lo - ymargin, y_hi + ymargin])
    return sns.regplot(x=feature, y=target,
                       scatter_kws={'alpha': 0.4, 's': 10},
                       line_kws={'lw': 1, 'color': 'black'})
# ボストンデータセット
# NOTE(review): load_boston was removed in scikit-learn 1.2; this cell needs an
# older scikit-learn or a replacement dataset.
boston = datasets.load_boston()
df_X = pd.DataFrame(boston.data, columns=boston.feature_names)
sr_y = pd.Series(boston.target, name='PRICE')
# Plot the first n_features columns, two panels per figure.
n_features = 5
for i, feature_name in enumerate(boston.feature_names[0:n_features]):
    print(i, feature_name)
    panel = i % 2 + 1  # 1 = left panel, 2 = right panel
    if panel == 1:
        # Start a new figure for every left-hand panel.
        fig = plt.figure(figsize=(8, 3))
        plt.subplots_adjust(wspace=0.3)
    ax = fig.add_subplot(1, 2, panel)
    ax = plt_scatter(sr_y, df_X[feature_name])
    if panel == 2:
        plt.show()
# Flush the last figure when the number of features is odd.
plt.show()
0 CRIM 1 ZN
2 INDUS 3 CHAS
4 NOX
# 目的変数(target)が連続、説明変数(Feature)が連続(数値)変数の場合
# Feature毎に、目的変数との相関、および、回帰直線を求めて残差をプロット
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
def plt_regplot(target, feature):
    """Draw *target* against *feature* with a regression line on the current Axes.

    The axis limits are set to the data range padded by 3% on every side.

    Args:
        target: Values for the y axis.
        feature: Values for the x axis.

    Returns:
        Whatever ``sns.regplot`` returns (the Axes it drew on).
    """
    # Compute each extreme once with NumPy instead of calling the Python
    # builtins min()/max() twice per bound.
    x_lo, x_hi = np.min(feature), np.max(feature)
    y_lo, y_hi = np.min(target), np.max(target)
    xmargin = (x_hi - x_lo) * 0.03
    ymargin = (y_hi - y_lo) * 0.03
    plt.xlim([x_lo - xmargin, x_hi + xmargin])
    plt.ylim([y_lo - ymargin, y_hi + ymargin])
    return sns.regplot(x=feature, y=target,
                       scatter_kws={'alpha': 0.5, 's': 12},
                       line_kws={'lw': 1, 'color': 'black'})
# NOTE(review): load_boston was removed in scikit-learn 1.2; this cell needs an
# older scikit-learn or a replacement dataset.
boston = datasets.load_boston()
df_X = pd.DataFrame(boston.data, columns=boston.feature_names)
df_y = pd.DataFrame(boston.target, columns=['Price'])
df = pd.concat([df_X, df_y], axis=1)
n_features = 3
print(f'length: {len(boston.feature_names)} Numeric Features')
target = 'Price'
label_style = dict(ha='right', va='bottom', color='red')
# n_features (rather than a second hard-coded 3) decides how many features are plotted.
for feature in boston.feature_names[0:n_features]:
    x = np.array(df[feature]).reshape(-1, 1)
    y = np.array(df[target])
    y_hat = LinearRegression().fit(x, y).predict(x)  # straight-line fit
    y_err = y - y_hat  # residuals
    # Rows with the largest and second-largest absolute residual.
    idx1 = np.argmax(np.abs(y_err))
    y_err2 = y_err.copy()
    y_err2[idx1] = 0.0
    idx2 = np.argmax(np.abs(y_err2))
    std = np.std(y_err)
    print('The candidates of outlier:',
          f'{feature}: {idx1}:{df.loc[idx1, feature]}, {idx2}:{df.loc[idx2, feature]}')
    fig = plt.figure(figsize=(10, 3))
    plt.subplots_adjust(wspace=0.3)
    #### Left panel: observations and regression line, outlier candidates labelled
    ax = fig.add_subplot(1, 2, 1)
    plt_regplot(df[target].astype(np.float64), df[feature].astype(np.float64))
    for idx in (idx1, idx2):
        ax.text(df.loc[idx, feature], df.loc[idx, target], str(idx), **label_style)
    #### Right panel: residuals with +/- 3 sigma guide lines
    ax = fig.add_subplot(1, 2, 2)
    x_lo, x_hi = np.min(x), np.max(x)
    ax.hlines(3 * std, x_lo, x_hi, ls='solid', lw=1, color='red')
    ax.hlines(-3 * std, x_lo, x_hi, ls='solid', lw=1, color='red')
    ax.text(x_hi, 3 * std, '3 sigma', ha='right', va='top', color='red', size=10)
    for idx in (idx1, idx2):
        ax.text(df.loc[idx, feature], y_err[idx], str(idx), **label_style)
    ax.scatter(x=x, y=y_err, alpha=0.3, s=10)
    ax.hlines(0, x_lo, x_hi, ls='solid', lw=1.5, color='black')
    plt.show()
length: 13 Numeric Features The candidates of outlier: CRIM: 371:9.2323, 372:8.26725
The candidates of outlier: ZN: 161:0.0, 162:0.0
The candidates of outlier: INDUS: 161:19.58, 162:19.58
# 3次元散布図
#
# http://www.python.ambitious-engineer.com/archives/883
# mplot3d tutorial
# https://matplotlib.org/mpl_toolkits/mplot3d/tutorial.html
# mpl_toolkits.mplot3d.axes3dというライブラリを追加でインポート
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d.axes3d import Axes3D
n = 20
# Flattened n x n grid: x is the row index and y the column index of each point.
x, y = np.divmod(np.arange(n * n), n)
# Alternative height with random noise instead of a sine ripple:
#z = (x+y)/2 + (np.random.rand(n*n)-0.5)*2
z = (x + y) / 2 + np.sin(x + y)
# Colour value: distance of each point from the cube centre, divided by n.
centre = (n - 1) / 2
v = np.sqrt((x - centre)**2 + (y - centre)**2 + (z - centre)**2) / n
#print(x,y,z,v)

# Scatter plot with each point coloured by v.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
p = ax.scatter(x, y, z, c=v, cmap='Blues')
# Show the colour bar.
fig.colorbar(p, shrink=0.8)
plt.show()
# 3次元サーフェス
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import numpy as np
# Make data: Z is a tilted plane with a sine ripple over an n x n grid.
n = 20
grid_axis = np.arange(n)
X, Y = np.meshgrid(grid_axis, grid_axis)
Z = (X + Y) / 2 + np.sin(X + Y)
#print(X[:3])
#print(Z[:3])

fig = plt.figure()
ax = fig.add_subplot(projection='3d')
# Plot the surface.
surface_style = dict(cmap=cm.coolwarm, linewidth=0, antialiased=False)
surf = ax.plot_surface(X, Y, Z, **surface_style)
# Add a color bar which maps values to colors.
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()
# 2 次元等高線図(コンター図)(塗りつぶし)
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
xs = np.arange(-5, 5, 1)
# z[i, j] = xs[i]**2 + xs[j]**2, built by broadcasting a column against a row.
z = xs[:, None]**2 + xs[None, :]**2
print(xs)
print(z)
# Filled contour plot of z over the xs-by-xs grid.
plt.contourf(xs, xs, z, cmap=cm.jet)
plt.colorbar()
plt.show()
[-5 -4 -3 -2 -1 0 1 2 3 4] [[50 41 34 29 26 25 26 29 34 41] [41 32 25 20 17 16 17 20 25 32] [34 25 18 13 10 9 10 13 18 25] [29 20 13 8 5 4 5 8 13 20] [26 17 10 5 2 1 2 5 10 17] [25 16 9 4 1 0 1 4 9 16] [26 17 10 5 2 1 2 5 10 17] [29 20 13 8 5 4 5 8 13 20] [34 25 18 13 10 9 10 13 18 25] [41 32 25 20 17 16 17 20 25 32]]
# 2次元、3次元配列の可視化
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
# This cell uses np but its import block does not import it.
import numpy as np

x = np.array([1,1,1,1,1, 2,2,2,2,2, 3,3,3,3,3, 4,4,4,4,4, 5,5,5,5,5])
y = np.array([1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5, 1,2,3,4,5])
z = np.array([5,6,7,5,2, 3,5,6,7,6, 3,5,4,3,2, 4,5,5,4,2, 4,4,4,3,3])
# z as a 5x5 grid with rows indexed by y and columns by x; computed once and
# reused by every contour/surface plot below.
z_grid = np.reshape(z, (5, 5)).T
axis_values = [1, 2, 3, 4, 5]

# 2D scatter coloured by z (heatmap-like view)
fig = plt.figure(figsize=(4, 3))
p = plt.scatter(x, y, c=z, linewidths=0, alpha=.8, cmap=cm.jet)
fig.colorbar(p)
plt.show()

# Filled 2D contour plot
print(z_grid)
fig = plt.figure(figsize=(4, 3))
plt.contourf(axis_values, axis_values, z_grid, cmap=cm.jet)
plt.colorbar()
plt.show()

# Filled contour plot drawn on 3D axes
fig = plt.figure(figsize=(5, 4))
ax = fig.add_subplot(projection='3d')
p = ax.contourf(axis_values, axis_values, z_grid, cmap=cm.jet)
fig.colorbar(p, shrink=0.5)
plt.show()

# 3D scatter
fig = plt.figure(figsize=(5, 4))
ax = fig.add_subplot(projection='3d')
p = ax.scatter(x, y, z, c=z, cmap=cm.jet)
fig.colorbar(p, shrink=0.5)
plt.show()

# 3D surface (x and y are rebound to the meshgrid, as in the original cell)
x = np.arange(1, 6)
y = np.arange(1, 6)
x, y = np.meshgrid(x, y)
fig = plt.figure(figsize=(5, 4))
ax = fig.add_subplot(projection='3d')
surf = ax.plot_surface(x, y, z_grid, cmap=cm.jet,
                       linewidth=0, antialiased=False)
fig.colorbar(surf, shrink=0.5, aspect=5)
plt.show()
[[5 3 3 4 4] [6 5 5 5 4] [7 6 4 5 4] [5 7 3 4 3] [2 6 2 2 3]]
# 2次元ヒストグラム
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
# Two 2-D Gaussian clouds of 5000 points each, stacked into one sample.
cloud_a = np.random.multivariate_normal([0, 0], [[10.0, 0], [0, 20]], 5000)
cloud_b = np.random.multivariate_normal([0, 15], [[10.0, 0], [0, 5]], 5000)
x, y = np.vstack((cloud_a, cloud_b)).T

fig = plt.figure()
ax = fig.add_subplot(111)
# hist2d returns a tuple whose fourth item is the drawn image, used for the colour bar.
H = ax.hist2d(x, y, bins=40, cmap=cm.jet)
ax.set(title='2D Histgram', xlabel='x', ylabel='y')
fig.colorbar(H[3], ax=ax)
plt.show()
# matplotlib でヒストグラムを描く
# http://pythondatascience.plavox.info/matplotlib/%E3%83%92%E3%82%B9%E3%83%88%E3%82%B0%E3%83%A9%E3%83%A0
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from matplotlib import cm
from numpy.random import *
np.random.seed(100)
x = np.random.normal(50, 10, 1000)  # 1,000 normal samples, mean 50, std 10
y = np.random.normal(80, 15, 700)   # 700 normal samples, mean 80, std 15
z = np.random.normal(5, 1, 1000).astype(int) * 10
c = np.random.choice(['A', 'B', 'C'], 1000, p=[0.2, 0.5, 0.3])  # p: probabilities
# matplotlib.pyplot.hist(x, bins=10, range=None, normed=False, weights=None,
# cumulative=False, bottom=None, histtype='bar',
# align='mid', orientation='vertical', rwidth=None,
# log=False, color=None, label=None, stacked=False,
# hold=None, data=None, **kwargs)

# Plain and cumulative histograms side by side.
fig = plt.figure(figsize=(8, 3))  # (width, height)
ax = fig.add_subplot(1, 2, 1)
ax.hist(x, bins=20, ec='black')
ax.set(title='Histogram', xlabel='X', ylabel='y')
ax = fig.add_subplot(1, 2, 2)
ax.hist(x, bins=20, color='skyblue', cumulative=True, ec='blue')
ax.set(title='Cumulative Histogram', xlabel='X')
#fig.show()

# Three ways to compare two samples.
fig = plt.figure(figsize=(12, 3))  # (width, height)
two_sample_opts = dict(bins=20, label=['X', 'Y'], color=['blue', 'red'])
# Overlapping
ax = fig.add_subplot(1, 3, 1)
for sample, name, colour in ((x, 'X', 'blue'), (y, 'Y', 'red')):
    ax.hist(sample, bins=20, label=name, color=colour, alpha=0.5)
ax.legend()
# Stacked
ax = fig.add_subplot(1, 3, 2)
ax.hist([x, y], stacked=True, **two_sample_opts)
ax.legend()
# Side by side
ax = fig.add_subplot(1, 3, 3)
ax.hist([x, y], stacked=False, **two_sample_opts)
ax.legend()
plt.show()

df = pd.DataFrame({'x': x, 'c': c})
# Seaborn: histogram of x split by category c, bars dodged, with KDE curves.
fig, ax = plt.subplots(figsize=(6, 3))
sns.histplot(data=df, x='x', hue='c', stat='count', bins=10,
             multiple='dodge', shrink=0.8, kde=True)
plt.show()
# Seaborn: counts of the discrete values z, split by category c.
fig, ax = plt.subplots(figsize=(6, 3))
z_by_category = pd.DataFrame({'z': z, 'c': c})
sns.countplot(x='z', hue='c', data=z_by_category)
plt.show()
# barh 水平棒グラフに数値コメント
import matplotlib.pyplot as plt
x = [10, 20, 15, 18]
y = list(reversed(range(len(x))))  # bar positions: 3, 2, 1, 0
label = ['Aaa', 'Bbb', 'Ccc', 'Ddd']
plt.barh(y, x, align='center', xerr=[1, 2, 3, 4], ecolor="black", capsize=10)
# Pair each bar length with its own position instead of indexing x and y by
# the values of y, which only worked because y is a permutation of the indices.
for bar_length, bar_pos in zip(x, y):
    plt.text(bar_length, bar_pos + 0.2, f'{bar_length:.3f}', ha='left', va='center')
plt.yticks(y, label)
plt.show()
# barplot 棒グラフにエラーバーと数値記入
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.DataFrame({
    'x': ['AA']*3 + ['BB']*3 + ['CC']*3,
    'y': [8, 10, 12, 23, np.nan, 16, 17, 18, 19],
})
display(df.T)
# Bar chart of the group means with standard-deviation error bars.
fig, ax = plt.subplots(figsize=(6, 3))
ax.set_title('sns.barplot with error bar and text')
# NOTE(review): ci/errcolor/errwidth are deprecated in newer seaborn releases
# (errorbar= / err_kws=); left unchanged to match the seaborn version in use.
ax = sns.barplot(
    data=df, y='x', x='y', orient='h',
    ci='sd', errcolor='black', errwidth=1.5, capsize=0.3,
    ax=ax,
)
# Write each group's mean next to its bar. sort=False keeps the groups in
# order of first appearance -- presumably the order seaborn draws string
# categories in; confirm if the data is reordered.
mean = df.groupby('x', sort=False)['y'].agg('mean')
# enumerate() gives the bar position without positional indexing into a
# label-indexed Series.
for i, group_mean in enumerate(mean):
    ax.text(x=group_mean, y=i - 0.15, s=f'{group_mean:.2f}', ha='left', va='bottom')
plt.show()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | |
|---|---|---|---|---|---|---|---|---|---|
| x | AA | AA | AA | BB | BB | BB | CC | CC | CC |
| y | 8.0 | 10.0 | 12.0 | 23.0 | NaN | 16.0 | 17.0 | 18.0 | 19.0 |
# barplot
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.DataFrame({
    'A': ['CC', 'BB', 'AA', 'DD'],
    'B1': [0.92, 0.94, 0.96, 0.98],
    'B2': [0.95, 0.97, 0.93, 0.99],
})
# Sort by B1, then reshape wide -> long: one row per (A, B_key) pair.
df = df.sort_values(['B1']).melt(
    id_vars=['A'], value_vars=['B1', 'B2'],
    var_name='B_key', value_name='B_val')
display(df)
sns.barplot(data=df, x='B_val', y='A', hue='B_key')
plt.xlim(0.9, 1.0)
# Anchor the legend's upper-left corner just outside the right edge of the plot.
legend_placement = dict(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
plt.legend(**legend_placement)
plt.show()
| A | B_key | B_val | |
|---|---|---|---|
| 0 | CC | B1 | 0.92 |
| 1 | BB | B1 | 0.94 |
| 2 | AA | B1 | 0.96 |
| 3 | DD | B1 | 0.98 |
| 4 | CC | B2 | 0.95 |
| 5 | BB | B2 | 0.97 |
| 6 | AA | B2 | 0.93 |
| 7 | DD | B2 | 0.99 |
# カテゴリ変数; BOXプロットのバリエーション
# boxenプロット、ジッター散布図, バイオリンプロット
import seaborn as sns
import matplotlib.pyplot as plt
tips = sns.load_dataset("tips")
print(tips.head())
# Every plot below shows total_bill per day, split by smoker.
by_day = dict(data=tips, x="day", y="total_bill", hue="smoker")

# Box plot and boxen plot
fig = plt.figure(figsize=(12, 4))  # (width, height)
ax = fig.add_subplot(1, 2, 1)
ax.set_title('boxplot')
ax = sns.boxplot(width=0.8, palette="muted", **by_day)
ax = fig.add_subplot(1, 2, 2)
ax.set_title('boxenplot')
ax = sns.boxenplot(palette="muted", **by_day)
plt.show()

# Jittered strip plot and swarm plot
fig = plt.figure(figsize=(12, 4))  # (width, height)
ax = fig.add_subplot(1, 2, 1)
ax.set_title('stripplot')
sns.stripplot(palette="muted", alpha=0.5, dodge=True, **by_day)
ax = fig.add_subplot(1, 2, 2)
ax.set_title('swarmplot')
sns.swarmplot(palette="muted", s=4, dodge=True, **by_day)
plt.show()

# Violin plot with the two hue levels drawn as halves of one violin
fig = plt.figure(figsize=(12, 4))  # (width, height)
ax = fig.add_subplot(1, 2, 1)
ax.set_title('violinplot')
ax = sns.violinplot(palette="muted", split=True, **by_day)
plt.show()

# Box plot with the individual observations drawn on top
fig = plt.figure(figsize=(12, 4))  # (width, height)
ax = fig.add_subplot(1, 2, 1)
ax = sns.boxplot(dodge=True, color='white', **by_day)
ax = sns.stripplot(dodge=True, s=6, alpha=0.4, **by_day)
plt.show()
total_bill tip sex smoker day time size 0 16.99 1.01 Female No Sun Dinner 2 1 10.34 1.66 Male No Sun Dinner 3 2 21.01 3.50 Male No Sun Dinner 3 3 23.68 3.31 Male No Sun Dinner 2 4 24.59 3.61 Female No Sun Dinner 4
# Pillow (PIL Fork) https://pillow.readthedocs.io/en/stable/reference/Image.html#functions
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.collections as mc
%matplotlib inline
from PIL import Image # Pillow
img = Image.open('./Data/photo.jpg')
print('Format:',img.format, 'Size w,h:',img.size, 'Mode:',img.mode)
print(img.getextrema())  # the minimum and maximum pixel values for each band
# Convert to a float32 array scaled to [0, 1]; the array is laid out (H, W, C).
img = np.array(img, 'float32') / 255
print('Shape h,w,c:',img.shape)
# Quick plot (The pyplot API)
#plt.imshow(img) # Matplotlib(H, W, CH)
#plt.show()

# The object-oriented API
# Shapes to draw over the image (ec: edgecolor, fc: facecolor, lw: linewidth).
x1, y1 = 100, 150
x2, y2 = 300, 350
rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                         ec='red', lw=2, fill=False)  # (x, y), width, height
circle = patches.Circle(xy=(300, 200), radius=100,
                        lw=2, ec='blue', fill=False, fc=None)
ellipse = patches.Ellipse(xy=(350, 400), width=150, height=100,
                          ec='blue', fc='blue', alpha=0.1)

# Several polylines at once: [[(x1, y1), (x2, y2), ...], ...]
lines = [[( 50, 80), (100, 120), (150, 50)],
         [(150, 50), (200, 120)],
         [(200, 50), (250, 120)]]
lc = mc.LineCollection(lines,
                       colors=['red', 'orange', 'limegreen'],
                       linewidths=[2, 3, 4],
                       linestyles=['-', '--', '-'])

# Several polygons at once
polygons = [
    patches.Polygon([(300, 100), (350, 50), (500, 80), (450, 150)], closed=True),
    patches.Polygon([(420, 120), (450, 20), (480, 80), (420, 150)], closed=True),
]
pc = mc.PatchCollection(polygons,
                        linewidths=[2, 3],
                        facecolors=['pink', 'limegreen'], alpha=0.3,
                        edgecolors=['red', 'forestgreen'], zorder=2)

fig, ax = plt.subplots(figsize=(10, 8))
#fig = plt.figure(figsize=(10,8))
#ax = plt.axes()
ax.grid(True)
ax.set_axisbelow(True)
ax.imshow(img)
for shape in (rect, circle, ellipse):
    ax.add_patch(shape)
for collection in (lc, pc):
    ax.add_collection(collection)
ax.axhline(120, color='yellow', linewidth=3, linestyle=':')   # horizontal line
ax.axvline(400, color='yellow', linewidth=3, linestyle='--')  # vertical line
ax.plot([100, 300], [350, 150], color='skyblue')  # segment: [x1, x2], [y1, y2]
plt.show()
Format: JPEG Size w,h: (520, 480) Mode: RGB ((0, 255), (0, 255), (0, 255)) Shape h,w,c: (480, 520, 3)
# pillow; 画像の反転(flip)、回転(rotate)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image # Pillow
def plt_img(img1, title1=None,
            img2=None, title2=None, img3=None, title3=None, img4=None, title4=None):
    """Show up to four images side by side in one 16x6-inch figure.

    The first image is always drawn. Images 2-4 are drawn only when both the
    image and its title are given. Each image keeps its own slot number, and
    the number of columns equals the highest slot in use.

    Args:
        img1: First image, passed to ``Axes.imshow``.
        title1: Title of the first panel.
        img2, img3, img4: Optional further images.
        title2, title3, title4: Titles of the optional panels.
    """
    # (slot, image, title) for every panel that will be drawn.
    panels = [(1, img1, title1)]
    optional = ((2, img2, title2), (3, img3, title3), (4, img4, title4))
    panels += [(slot, image, title) for slot, image, title in optional
               if image is not None and title is not None]
    n_img = panels[-1][0]
    fig = plt.figure(figsize=(16, 6))
    for slot, image, title in panels:
        ax = fig.add_subplot(1, n_img, slot)
        ax.set_title(title)
        ax.imshow(image)
        ax.grid(True)
    plt.show()
img = Image.open('./Data/photo.jpg')
print('Format:',img.format, 'Size w,h:',img.size, 'Mode:',img.mode)
print(img.getextrema())  # the minimum and maximum pixel values for each band
# float32 array scaled to [0, 1], laid out (H, W, C)
img_arr = np.array(img, 'float32') / 255
print('Shape h,w,c:',img_arr.shape)
# Original, left-right flip, and rotations by one and three quarter turns
# (np.rot90 rotates anticlockwise).
flipped = np.fliplr(img_arr)
quarter_turn = np.rot90(img_arr, 1)
three_quarter_turn = np.rot90(img_arr, 3)
plt_img(img_arr, './Data/photo.jpg', flipped, 'fliplr',
        quarter_turn, 'rot90', three_quarter_turn, 'rot90')
Format: JPEG Size w,h: (520, 480) Mode: RGB ((0, 255), (0, 255), (0, 255)) Shape h,w,c: (480, 520, 3)
# cv2で読み込んだarrayをNotebookに表示
import cv2
from IPython import display
def imshow(img, format=".jpg", **kwargs):
    """Encode an ndarray image and display it inline in the notebook.

    Args:
        img: Image array to encode; passed to ``cv2.imencode`` unchanged.
        format: File extension (including the leading dot) that selects the
            encoder, e.g. ``".jpg"`` or ``".png"``.
        **kwargs: Forwarded to ``IPython.display.Image``.

    Raises:
        ValueError: If ``cv2.imencode`` reports that encoding failed.
    """
    # imencode returns (success flag, encoded buffer); check the flag instead
    # of silently displaying a buffer from a failed encode.
    ok, encoded = cv2.imencode(format, img)
    if not ok:
        raise ValueError(f"cv2.imencode could not encode the image as {format!r}")
    # Pass raw bytes, the documented data type for display.Image.
    display.display(display.Image(encoded.tobytes(), **kwargs))
# Read the sample photo with OpenCV and show it inline via imshow().
img = cv2.imread('./Data/photo.jpg')
imshow(img)
# ペアプロット、相関係数ヒートマップ、散布図+回帰直線、残差プロット
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm
pd.set_option('display.width', 100)
pd.set_option('display.precision', 4)

# Wine data: load, summarize, and draw the pair plot of the raw values.
wine = pd.read_csv('./data/winequality-red.csv', sep=';', header=0)
print('Describe:\n', wine.describe(percentiles=[.5]), sep='')
sns.pairplot(wine, height=2.5)
plt.show()

# Every column except the target 'quality' is a feature.
wine_feature_names = list(wine.columns)
wine_feature_names.remove('quality')

# Min-max normalize each feature to the range 0-1.
# Work on a copy: a plain assignment would alias `wine`, and the
# normalization below would then overwrite the raw measurements.
wine_nr = wine.copy()
wine_nr[wine_feature_names] = wine_nr[wine_feature_names].apply(
    lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

# Correlation coefficients (printed and drawn as a heatmap).
corr = wine_nr.corr()
print('\n相関係数を表示\n', corr)
fig = plt.figure(figsize=(10,6))
ax = fig.add_subplot(111)
ax.set_title('Pearson Correlation Coefficient')
sns.heatmap(corr, annot=True, fmt='.2f', center=0, square=False, cmap=cm.seismic)
plt.yticks(rotation='horizontal')
plt.show()

# Plot each feature against quality (long format: one row per feature value).
wine_nr_lf = pd.melt(wine_nr, id_vars=['quality'], value_vars=wine_feature_names)
print('Tidied\n', wine_nr_lf.head(4))
sns.lmplot(x='value', y='quality', data=wine_nr_lf, hue='variable', col='variable',
           scatter_kws={'s':3}, y_jitter=0.02, col_wrap=4, height=2.5)
plt.show()

# Explanatory variables: everything except "quality".
X = wine_nr.drop("quality", axis=1).values
# Target variable: "quality" (the quality score).
Y = wine['quality'].values

# Fit the prediction model.
from sklearn import linear_model
# Import the display function under its own name: the bare name `display`
# may be bound to the IPython.display module by `from IPython import display`
# elsewhere in this file, which would make `display(...)` uncallable.
from IPython.display import display as ipy_display
clf = linear_model.LinearRegression().fit(X, Y)

# Partial regression coefficients, largest magnitude first.
print('\n偏回帰係数')
ipy_display(pd.DataFrame({
    "Name":wine_feature_names,
    "Coefficients":clf.coef_,
    "Abs_Coef":np.abs(clf.coef_)
}).sort_values(by='Abs_Coef', ascending=False))
# Intercept
print('Intercept: ', clf.intercept_)

## Residual analysis
# Predicted values
predictions = clf.predict(X)
# Residuals (predicted minus observed)
residuals = predictions - wine['quality']
# Standard deviation of the residuals
std = np.std(residuals)
print('Standard deviation (σ) of Residuals:',std)
plt.scatter(x=wine.index, y=residuals, s=3, c='blue', marker='.')
plt.xlabel('Index')
plt.ylabel('Residuals')
# Draw horizontal lines at y = 0 and at ±σ.
plt.hlines(y=0, xmin=0, xmax=len(predictions), lw=2, color='red')
plt.hlines(y= std, xmin=0, xmax=len(predictions), lw=1, color='green')
plt.hlines(y=-std, xmin=0, xmax=len(predictions), lw=1, color='green')
plt.show()

# Plot the residuals against each normalized feature.
# Copy again so the extra 'residuals' column stays out of `wine_nr`.
wine_nr_rs = wine_nr.copy()
wine_nr_rs['residuals'] = residuals
wine_nr_rs_lf = pd.melt(wine_nr_rs, id_vars=['residuals'], value_vars=wine_feature_names)
print('Tidied\n', wine_nr_rs_lf.head(4))
sns.lmplot(x='value', y='residuals', data=wine_nr_rs_lf, hue='variable', col='variable',
           scatter_kws={'s':3}, col_wrap=4, height=2.5)
plt.show()
Describe:
fixed acidity volatile acidity citric acid residual sugar chlorides \
count 1599.0000 1599.0000 1599.0000 1599.0000 1599.0000
mean 8.3196 0.5278 0.2710 2.5388 0.0875
std 1.7411 0.1791 0.1948 1.4099 0.0471
min 4.6000 0.1200 0.0000 0.9000 0.0120
50% 7.9000 0.5200 0.2600 2.2000 0.0790
max 15.9000 1.5800 1.0000 15.5000 0.6110
free sulfur dioxide total sulfur dioxide density pH sulphates alcohol \
count 1599.0000 1599.0000 1599.0000 1599.0000 1599.0000 1599.0000
mean 15.8749 46.4678 0.9967 3.3111 0.6581 10.4230
std 10.4602 32.8953 0.0019 0.1544 0.1695 1.0657
min 1.0000 6.0000 0.9901 2.7400 0.3300 8.4000
50% 14.0000 38.0000 0.9968 3.3100 0.6200 10.2000
max 72.0000 289.0000 1.0037 4.0100 2.0000 14.9000
quality
count 1599.0000
mean 5.6360
std 0.8076
min 3.0000
50% 6.0000
max 8.0000
相関係数を表示
fixed acidity volatile acidity citric acid residual sugar chlorides \
fixed acidity 1.0000 -0.2561 0.6717 0.1148 0.0937
volatile acidity -0.2561 1.0000 -0.5525 0.0019 0.0613
citric acid 0.6717 -0.5525 1.0000 0.1436 0.2038
residual sugar 0.1148 0.0019 0.1436 1.0000 0.0556
chlorides 0.0937 0.0613 0.2038 0.0556 1.0000
free sulfur dioxide -0.1538 -0.0105 -0.0610 0.1870 0.0056
total sulfur dioxide -0.1132 0.0765 0.0355 0.2030 0.0474
density 0.6680 0.0220 0.3649 0.3553 0.2006
pH -0.6830 0.2349 -0.5419 -0.0857 -0.2650
sulphates 0.1830 -0.2610 0.3128 0.0055 0.3713
alcohol -0.0617 -0.2023 0.1099 0.0421 -0.2211
quality 0.1241 -0.3906 0.2264 0.0137 -0.1289
free sulfur dioxide total sulfur dioxide density pH sulphates \
fixed acidity -0.1538 -0.1132 0.6680 -0.6830 0.1830
volatile acidity -0.0105 0.0765 0.0220 0.2349 -0.2610
citric acid -0.0610 0.0355 0.3649 -0.5419 0.3128
residual sugar 0.1870 0.2030 0.3553 -0.0857 0.0055
chlorides 0.0056 0.0474 0.2006 -0.2650 0.3713
free sulfur dioxide 1.0000 0.6677 -0.0219 0.0704 0.0517
total sulfur dioxide 0.6677 1.0000 0.0713 -0.0665 0.0429
density -0.0219 0.0713 1.0000 -0.3417 0.1485
pH 0.0704 -0.0665 -0.3417 1.0000 -0.1966
sulphates 0.0517 0.0429 0.1485 -0.1966 1.0000
alcohol -0.0694 -0.2057 -0.4962 0.2056 0.0936
quality -0.0507 -0.1851 -0.1749 -0.0577 0.2514
alcohol quality
fixed acidity -0.0617 0.1241
volatile acidity -0.2023 -0.3906
citric acid 0.1099 0.2264
residual sugar 0.0421 0.0137
chlorides -0.2211 -0.1289
free sulfur dioxide -0.0694 -0.0507
total sulfur dioxide -0.2057 -0.1851
density -0.4962 -0.1749
pH 0.2056 -0.0577
sulphates 0.0936 0.2514
alcohol 1.0000 0.4762
quality 0.4762 1.0000
Tidied
quality variable value
0 5 fixed acidity 0.2478
1 5 fixed acidity 0.2832
2 5 fixed acidity 0.2832
3 6 fixed acidity 0.5841
偏回帰係数
| | Name | Coefficients | Abs_Coef |
|---|---|---|---|
| 10 | alcohol | 1.7953 | 1.7953 |
| 1 | volatile acidity | -1.5820 | 1.5820 |
| 9 | sulphates | 1.5303 | 1.5303 |
| 4 | chlorides | -1.1227 | 1.1227 |
| 6 | total sulfur dioxide | -0.9239 | 0.9239 |
| 8 | pH | -0.5253 | 0.5253 |
| 5 | free sulfur dioxide | 0.3097 | 0.3097 |
| 0 | fixed acidity | 0.2824 | 0.2824 |
| 7 | density | -0.2435 | 0.2435 |
| 3 | residual sugar | 0.2384 | 0.2384 |
| 2 | citric acid | -0.1826 | 0.1826 |
Intercept: 5.71255299667039 Standard deviation (σ) of Residuals: 0.645575067069205
Tidied
residuals variable value
0 0.0329 fixed acidity 0.2478
1 0.1379 fixed acidity 0.2832
2 0.2099 fixed acidity 0.2832
3 -0.3061 fixed acidity 0.5841
#====================================================
# ■ Pandas Plot
# PythonでPandasのPlot機能を使えばデータ加工からグラフ作成までマジでシームレス
# https://qiita.com/hik0107/items/de5785f680096df93efa
# matplotlib (+ pandas) によるデータ可視化の方法 (3)
# https://qiita.com/ynakayama/items/9979258ac68cb669757a
# 配列の視覚化(ヒートマップ的な)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
n = 5
img = np.zeros((n, n, 3), 'float32')  # array to be plotted
c0 = 1 / (n * n)
c = 0.0
# Walk the cells row by row, raising the value by one step per cell and
# writing it to all three channels of that cell.
for y, x in np.ndindex(n, n):
    c += c0
    img[y, x] = c
print(img[:, :, 0])

fig, ax = plt.subplots(figsize=(5, 5))
ax.grid(False)
ax.imshow(img)
[[0.04 0.08 0.12 0.16 0.2 ] [0.24 0.28 0.32 0.36 0.4 ] [0.44 0.48 0.52 0.56 0.6 ] [0.64 0.68 0.72 0.76 0.8 ] [0.84 0.88 0.92 0.96 1. ]]
<matplotlib.image.AxesImage at 0x1ebff2f37f0>
# カラーサンプル
# List of named colors
# https://matplotlib.org/stable/gallery/color/named_colors.html
from matplotlib.patches import Rectangle
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
def _plot_swatch_table(swatches, title, emptycols=0):
    """Draw a table of labelled color swatches and return its figure.

    Shared implementation behind plot_colortable() and plot_hex_colors().

    Args:
        swatches: Sequence of ``(label, color)`` pairs; ``label`` is the text
            written next to the swatch and ``color`` its face color.
        title: Figure title, drawn left-aligned above the table.
        emptycols: How many of the four table columns to leave unused.

    Returns:
        The matplotlib figure that was created (it is also shown).

    Raises:
        ValueError: If ``emptycols`` leaves no column to draw into.
    """
    cell_width = 212
    cell_height = 22
    swatch_width = 48
    margin = 12
    topmargin = 40
    max_cols = 4  # the figure is always sized for four columns

    ncols = max_cols - emptycols
    if ncols < 1:
        raise ValueError(
            f"emptycols must leave at least one of {max_cols} columns, got {emptycols}")

    n = len(swatches)
    nrows = n // ncols + int(n % ncols > 0)  # ceil(n / ncols)

    width = cell_width * max_cols + 2 * margin
    height = cell_height * nrows + margin + topmargin
    dpi = 72

    fig, ax = plt.subplots(figsize=(width / dpi, height / dpi), dpi=dpi)
    fig.subplots_adjust(margin/width, margin/height,
                        (width-margin)/width, (height-topmargin)/height)
    ax.set_xlim(0, cell_width * max_cols)
    # Inverted y-limits: row 0 is at the top.
    ax.set_ylim(cell_height * (nrows-0.5), -cell_height/2.)
    ax.yaxis.set_visible(False)
    ax.xaxis.set_visible(False)
    ax.set_axis_off()
    ax.set_title(title, fontsize=24, loc="left", pad=10)

    print(f'len:{n}')
    for i, (label, color) in enumerate(swatches):
        # Fill column by column: the first nrows entries form column 0.
        col, row = divmod(i, nrows)
        y = row * cell_height
        swatch_start_x = cell_width * col
        text_pos_x = cell_width * col + swatch_width + 7

        ax.text(text_pos_x, y, label, fontsize=14,
                horizontalalignment='left',
                verticalalignment='center')
        ax.add_patch(
            Rectangle(xy=(swatch_start_x, y-9), width=swatch_width,
                      height=18, facecolor=color, edgecolor='0.7')
        )
    plt.show()
    return fig


def plot_colortable(colors, title, sort_colors=True, emptycols=0):
    """Plot a mapping of color names to color values as a swatch table.

    Each swatch is labelled with its color value followed by its name.

    Args:
        colors: Mapping from color name to a matplotlib color value.
        title: Figure title.
        sort_colors: When exactly ``True``, order the entries by hue,
            saturation, value and then name; otherwise keep mapping order.
        emptycols: How many of the four table columns to leave unused.

    Returns:
        The matplotlib figure that was created.

    Raises:
        ValueError: If ``emptycols`` leaves no column to draw into.
    """
    if sort_colors is True:
        # Sort colors by hue, saturation, value and name.
        names = sorted(
            colors,
            key=lambda name: (tuple(mcolors.rgb_to_hsv(mcolors.to_rgb(colors[name]))),
                              name))
    else:
        names = list(colors)
    swatches = [(str(colors[name]) + ' ' + name, colors[name]) for name in names]
    return _plot_swatch_table(swatches, title, emptycols)


def plot_hex_colors(hex_colors, title, emptycols=0):
    """Plot a sequence of color strings as a swatch table, in the given order.

    Each swatch is labelled with the color string itself.

    Args:
        hex_colors: Sequence of color strings such as ``'#FF7F00'``.
        title: Figure title.
        emptycols: How many of the four table columns to leave unused.

    Returns:
        The matplotlib figure that was created.

    Raises:
        ValueError: If ``emptycols`` leaves no column to draw into.
    """
    swatches = [(color, color) for color in hex_colors]
    return _plot_swatch_table(swatches, title, emptycols)
# Three rows of twelve hues each: fully saturated, darkened, and lightened.
hex_colors = [
    # red ... yellow
    '#FF0000', '#FF7F00', '#FFFF00', '#7FFF00',
    # green ... cyan
    '#00FF00', '#00FF7F', '#00FFFF', '#007FFF',
    # blue ... magenta
    '#0000FF', '#7F00FF', '#FF00FF', '#FF009F',

    '#8F0000', '#8F5F00', '#8F8F00', '#5F8F00',
    '#008F00', '#008F5F', '#008F8F', '#005F8F',
    '#00008F', '#5F008F', '#8F008F', '#8F005F',

    '#FF7F7F', '#FFAF4F', '#FFFF7F', '#AFFF7F',
    '#7FFF7F', '#7FFFAF', '#7FFFFF', '#7FAFFF',
    '#7F7FFF', '#AF7FFF', '#FF7FFF', '#FF7FCF',
]
plot_hex_colors(hex_colors, "Hex Colors", emptycols=1)

# Two hand-picked sequences of CSS color names, each shown in the listed order.
for color_names in (
    ['red', 'darkorange', 'gold', 'olivedrab', 'forestgreen', 'green', 'darkcyan',
     'blue', 'dodgerblue',
     'blueviolet', 'purple', 'darkviolet', 'mediumvioletred'],
    ['pink', 'orange', 'yellow', 'yellowgreen', 'limegreen', 'green', 'lightseagreen',
     'deepskyblue', 'turquoise',
     'mediumpurple', 'magenta', 'violet', 'orchid'],
):
    plot_colortable({name: mcolors.CSS4_COLORS[name] for name in color_names},
                    "Original Color Seq.", sort_colors=False, emptycols=1)

# Color dictionaries shipped with matplotlib.
for palette, heading, options in (
    (mcolors.BASE_COLORS, "Base Colors", {'sort_colors': False, 'emptycols': 1}),
    (mcolors.TABLEAU_COLORS, "Tableau Palette", {'sort_colors': False, 'emptycols': 2}),
    (mcolors.CSS4_COLORS, "CSS Colors", {}),
    (mcolors.cnames, "cnames Colors", {'sort_colors': False, 'emptycols': 1}),
):
    plot_colortable(palette, heading, **options)

# Optionally plot the XKCD colors (Caution: will produce large figure)
#xkcd_fig = plot_colortable(mcolors.XKCD_COLORS, "XKCD Colors")
#xkcd_fig.savefig("XKCD_Colors.png")
plt.show()
len:36
len:13
len:13
len:8
len:10
len:148
len:148
## t-test for two groups with unequal variances (equal_var=False)
## If the p-value is 0.05 or less, the hypothesis that the two groups are equal
## is rejected at the 5% significance level.
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from scipy import stats
%matplotlib inline
# Draw 100 values for each group from normal distributions with different
# mean (loc) and standard deviation (scale).
np.random.seed(0)
data1 = stats.norm.rvs(loc=0, scale=1, size=100)
# NOTE(review): the seed is reset to the same value before the second draw.
# If rvs draws from NumPy's global random state, data2 is then a shifted and
# scaled copy of data1 rather than an independent sample — confirm intended.
np.random.seed(0)
data2 = stats.norm.rvs(loc=1, scale=2, size=100)
# Overlay the two histograms (with KDE curves) on the same axes.
fig, ax = plt.subplots()
sns.histplot(x=data1, binwidth=0.5, color='blue', label='data1: mean=0, sd=1', kde=True)
sns.histplot(x=data2, binwidth=0.5, color='orange', label='data2: mean=1, sd=2', kde=True)
ax.set_title('Two Normal distribution random numbers Histogram')
ax.legend()
ax.set_xlabel('X')
plt.show()
# Two-sample t-test without assuming equal variances.
stats.ttest_ind(data1, data2, equal_var=False)
Ttest_indResult(statistic=-4.67896719638844, pvalue=6.530806973739329e-06)
#============================================
# ネットワーク図
# ◆ NetworkX
#
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import random
import string
%matplotlib inline
random.seed(0)
n = 10
# n*3 random edges, each a pair of two distinct node numbers below n.
adj = [random.sample(range(n), 2) for _ in range(n*3)]
print(adj)

# Instantiate a Graph object from the list of edges as argument
g = nx.Graph(adj)
plt.figure(figsize=(4,4))

# Marker size and color value are both derived from the node number;
# each node is labelled with the uppercase letter at that index.
node_ids = np.array(g.nodes())
node_size = (node_ids + 5) * 30
node_color = (node_ids + 10) * 10
labels = {node: string.ascii_uppercase[node] for node in g.nodes()}
print(g.nodes(), node_size, node_color, labels)

nx.draw(g, node_size=node_size, node_color=node_color, labels=labels,
        cmap=plt.cm.winter, alpha=.8, font_size=20, width=.5)
plt.axis('off')
plt.show()
[[6, 9], [0, 4], [8, 7], [6, 4], [7, 5], [9, 3], [8, 2], [4, 2], [1, 4], [8, 2], [4, 1], [1, 5], [7, 8], [1, 5], [6, 5], [9, 3], [8, 7], [7, 8], [4, 0], [8, 0], [1, 6], [0, 7], [5, 3], [5, 1], [3, 9], [3, 2], [8, 7], [1, 9], [5, 8], [7, 1]]
[6, 9, 0, 4, 8, 7, 5, 3, 2, 1] [330 420 150 270 390 360 300 240 210 180] [160 190 100 140 180 170 150 130 120 110] {6: 'G', 9: 'J', 0: 'A', 4: 'E', 8: 'I', 7: 'H', 5: 'F', 3: 'D', 2: 'C', 1: 'B'}